1 Data

Load the student scores for the test:

Show data summary
# Overview of the loaded score data (skimr summary, shown before cleaning).
skim(test_scores)
Data summary
Name test_scores
Number of rows 8993
Number of columns 28
_______________________
Column type frequency:
character 4
numeric 24
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
test_version 0 1 3 4 0 2 0
year 0 1 7 7 0 9 0
class 8993 0 NA NA 0 0 0
AnonID 0 1 9 9 0 8899 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
e1_E1 39 1.00 4.38 1.48 0 5.00 5.0 5.00 5 ▁▁▁▁▇
e2_E0 5500 0.39 4.23 1.73 0 5.00 5.0 5.00 5 ▁▁▁▁▇
e3_E2 211 0.98 3.62 2.24 0 0.00 5.0 5.00 5 ▃▁▁▁▇
e4_E3 132 0.99 4.00 1.43 0 3.00 5.0 5.00 5 ▁▁▂▂▇
e5_E4 375 0.96 4.20 1.83 0 5.00 5.0 5.00 5 ▂▁▁▁▇
e6_E5 1377 0.85 1.99 1.89 0 0.00 2.0 2.00 5 ▇▇▁▁▅
e7_E6 716 0.92 4.00 1.93 0 5.00 5.0 5.00 5 ▂▁▁▁▇
e8_E0 5500 0.39 3.03 2.44 0 0.00 5.0 5.00 5 ▅▁▁▁▇
e9_E8 199 0.98 4.37 1.66 0 5.00 5.0 5.00 5 ▁▁▁▁▇
e10_E9 244 0.97 3.62 1.90 0 2.50 5.0 5.00 5 ▂▁▃▁▇
e11_E0 5500 0.39 3.89 1.67 0 2.50 5.0 5.00 5 ▁▁▂▁▇
e12_E12 371 0.96 4.20 1.74 0 5.00 5.0 5.00 5 ▂▁▁▁▇
e13_E13 304 0.97 3.85 1.85 0 2.00 5.0 5.00 5 ▂▂▁▁▇
e14_E14 576 0.94 2.97 1.91 0 2.00 2.0 5.00 5 ▃▆▁▁▇
e15_E15 352 0.96 3.60 2.01 0 2.50 5.0 5.00 5 ▂▁▂▁▇
e16_E16 270 0.97 4.49 1.51 0 5.00 5.0 5.00 5 ▁▁▁▁▇
e17_E17 200 0.98 4.62 1.33 0 5.00 5.0 5.00 5 ▁▁▁▁▇
e18_E18 367 0.96 3.77 2.15 0 5.00 5.0 5.00 5 ▂▁▁▁▇
e19_E19 1044 0.88 3.17 2.41 0 0.00 5.0 5.00 5 ▅▁▁▁▇
e20_E20 920 0.90 2.37 2.50 0 0.00 0.0 5.00 5 ▇▁▁▁▇
Total 0 1.00 69.65 21.55 0 58.50 74.0 85.50 100 ▁▁▃▇▇
e0_E7 4149 0.54 2.26 2.49 0 0.00 0.0 5.00 5 ▇▁▁▁▆
e0_E10 4515 0.50 2.77 1.73 0 1.25 2.5 4.38 5 ▃▇▂▅▇
e0_E11 4008 0.55 4.18 1.41 0 3.00 5.0 5.00 5 ▁▂▁▁▇

There are some NAs in the data. These are exclusively in the results from the post version, where Moodle recorded a null response as NA (i.e. cases where students did not give an answer that could be graded).

For this analysis, we replace the NA scores with 0, reflecting a non-correct answer.

# Combine the two test versions into one data frame, with a test_version
# column (via .id) recording which version each row came from. Column
# names are unified to the shared labels from test_versions.
test_scores <- bind_rows(
  "pre" = replace_names(
    test_scores_pre,
    old_names = test_versions$pre,
    new_names = test_versions$label
  ),
  # Moodle recorded null (ungradeable) responses in the post data as NA;
  # score those as 0 before unifying the column names.
  "post" = test_scores_post %>%
    mutate(across(matches("^E\\d"), ~ replace_na(., 0))) %>%
    replace_names(old_names = test_versions$post, new_names = test_versions$label),
  .id = "test_version"
)
Show data summary
# Re-summarise after the NA -> 0 replacement to confirm the post columns
# now have no missing values.
skim(test_scores)
Data summary
Name test_scores
Number of rows 8993
Number of columns 28
_______________________
Column type frequency:
character 4
numeric 24
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
test_version 0 1 3 4 0 2 0
year 0 1 7 7 0 9 0
class 8993 0 NA NA 0 0 0
AnonID 0 1 9 9 0 8899 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
e1_E1 0 1.00 4.36 1.50 0 5.00 5.00 5.00 5 ▁▁▁▁▇
e2_E0 5500 0.39 4.23 1.73 0 5.00 5.00 5.00 5 ▁▁▁▁▇
e3_E2 0 1.00 3.53 2.28 0 0.00 5.00 5.00 5 ▃▁▁▁▇
e4_E3 0 1.00 3.94 1.50 0 3.00 5.00 5.00 5 ▁▁▂▂▇
e5_E4 0 1.00 4.03 1.98 0 5.00 5.00 5.00 5 ▂▁▁▁▇
e6_E5 0 1.00 1.69 1.89 0 0.00 2.00 2.00 5 ▇▆▁▁▃
e7_E6 0 1.00 3.68 2.14 0 2.50 5.00 5.00 5 ▃▁▁▁▇
e8_E0 5500 0.39 3.03 2.44 0 0.00 5.00 5.00 5 ▅▁▁▁▇
e9_E8 0 1.00 4.27 1.77 0 5.00 5.00 5.00 5 ▂▁▁▁▇
e10_E9 0 1.00 3.52 1.97 0 2.50 5.00 5.00 5 ▂▁▃▁▇
e11_E0 5500 0.39 3.89 1.67 0 2.50 5.00 5.00 5 ▁▁▂▁▇
e12_E12 0 1.00 4.02 1.90 0 5.00 5.00 5.00 5 ▂▁▁▁▇
e13_E13 0 1.00 3.72 1.95 0 2.00 5.00 5.00 5 ▂▂▁▁▇
e14_E14 0 1.00 2.78 1.99 0 1.00 2.00 5.00 5 ▅▆▁▁▇
e15_E15 0 1.00 3.46 2.09 0 1.25 5.00 5.00 5 ▃▁▂▁▇
e16_E16 0 1.00 4.36 1.67 0 5.00 5.00 5.00 5 ▁▁▁▁▇
e17_E17 0 1.00 4.51 1.48 0 5.00 5.00 5.00 5 ▁▁▁▁▇
e18_E18 0 1.00 3.62 2.24 0 0.00 5.00 5.00 5 ▃▁▁▁▇
e19_E19 0 1.00 2.81 2.48 0 0.00 5.00 5.00 5 ▆▁▁▁▇
e20_E20 0 1.00 2.13 2.47 0 0.00 0.00 5.00 5 ▇▁▁▁▆
Total 0 1.00 69.65 21.55 0 58.50 74.00 85.50 100 ▁▁▃▇▇
e0_E7 3493 0.61 1.99 2.45 0 0.00 0.00 5.00 5 ▇▁▁▁▅
e0_E10 3493 0.61 2.25 1.89 0 0.00 1.88 4.38 5 ▇▆▂▃▇
e0_E11 3493 0.61 3.78 1.81 0 2.00 5.00 5.00 5 ▂▂▁▁▇

1.1 Data cleaning

  1. For students who took the test more than once, consider the attempt with the highest scores only and remove the others;

  2. Eliminate the students who scored three or more zeros in the 5 easiest questions in the second-half of the test; and

  3. Add the students scoring more than 30 marks in total back to the sample.

# keep a copy of the unfiltered data so the effect of cleaning can be
# inspected below
test_scores_unfiltered <- test_scores

# Cleaning steps (see the numbered list above):
#  1. one attempt per student: keep the attempt with the highest Total
#     (with_ties = FALSE so an exact tie still leaves a single row);
#  2. drop students scoring zero on >= 3 of the 5 easiest second-half
#     questions (the item sets differ between the two test versions);
#  3. but retain anyone with a Total of at least 30 marks.
test_scores <- test_scores_unfiltered %>% 
  group_by(AnonID) %>% 
  slice_max(Total, n = 1, with_ties = FALSE) %>% 
  ungroup() %>% 
  rowwise() %>% 
  mutate(invalid_in_easiest_5 = case_when(
    test_version == "pre" ~ sum(e11_E0 == 0, e12_E12 == 0, e13_E13 == 0, e16_E16 == 0, e17_E17 == 0),
    # NOTE(review): the post NAs were already replaced with 0 when the data
    # was loaded (the skim above shows 0 missing in these columns), so the
    # previous is.na() checks could never fire and no post row was ever
    # excluded by rule 2. Test for zero scores, mirroring the "pre" branch.
    test_version == "post" ~ sum(e0_E11 == 0, e12_E12 == 0, e16_E16 == 0, e17_E17 == 0, e18_E18 == 0)
  )) %>% 
  filter(invalid_in_easiest_5 <= 2 | Total >= 30) %>% 
  select(-invalid_in_easiest_5) %>% 
  ungroup()

# test_scores <- test_scores_unfiltered %>%
#   group_by(AnonID) %>%
#   slice_max(Total, n = 1) %>%
#   ungroup() %>%
#   filter(Total > 0)

# Histograms of Total score before vs after the cleaning steps,
# displayed side by side.
bind_rows(
  "unfiltered" = select(test_scores_unfiltered, Total),
  "filtered" = select(test_scores, Total),
  .id = "dataset"
) %>%
  mutate(dataset = factor(dataset, levels = c("unfiltered", "filtered"))) %>%
  ggplot(aes(x = Total)) +
  geom_histogram(bins = 25) +
  facet_wrap(vars(dataset)) +
  theme_minimal()

# Count responses per test version before and after filtering, and report
# the fraction retained.
bind_rows(
  "unfiltered" = test_scores_unfiltered %>% select(test_version, Total),
  "filtered" = test_scores %>% select(test_version, Total),
  .id = "dataset"
) %>% 
  # cross-tabulate test_version (rows) against dataset (columns)
  janitor::tabyl(test_version, dataset) %>% 
  mutate(percent_retained = filtered / unfiltered) %>% 
  select(test_version, unfiltered, filtered, percent_retained) %>% 
  gt() %>% 
  fmt_percent(columns = contains("percent"), decimals = 2)
test_version unfiltered filtered percent_retained
post 5500 5433 98.78%
pre 3493 3214 92.01%

1.2 Data summary

The number of responses from each cohort:

# Number of (cleaned) responses per academic year, as a colour-shaded table.
test_scores %>% 
  group_by(year) %>% 
  tally() %>% 
  gt() %>% 
  # shade counts so larger cohorts appear darker
  data_color(
    columns = c("n"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  )
year n
2013-14 815
2014-15 923
2015-16 719
2016-17 757
2017-18 909
2018-19 1145
2019-20 1216
2020-21 1171
2021-22 992

There is the same (roughly normal) distribution of raw scores each year:

# One histogram of raw totals per cohort, to check the distribution is
# stable from year to year.
ggplot(test_scores, aes(x = Total)) +
  geom_histogram(binwidth = 2) +
  facet_wrap(~ year) +
  theme_minimal()

Mean and standard deviation for each item (note this table is very wide - see the scroll bar at the bottom!):

# Per-item mean and SD for every cohort, with one column group per year.
test_scores %>% 
  select(-class, -test_version, -Total) %>% 
  group_by(year) %>% 
  skim_without_charts() %>% 
  # drop the character summaries, percentile columns and type tag,
  # keeping n_missing / mean / sd per item per year
  select(-contains("character."), -contains("numeric.p"), -skim_type) %>% 
  rename(complete = complete_rate) %>% 
  # make the table wider, i.e. with separate columns for each year's results, with the year at the start of each column name
  pivot_wider(names_from = year, values_from = -c(skim_variable, year), names_glue = "{year}__{.value}") %>% 
  # put the columns in order by year
  select(sort(names(.))) %>% 
  select(skim_variable, everything()) %>% 
  select(-contains("complete")) %>% 
  # use GT to make the table look nice
  gt(rowname_col = "skim_variable") %>% 
  # group the columns from each year (splitting names on the "__" inserted above)
  tab_spanner_delim(delim = "__") %>%
  fmt_number(columns = contains("numeric"), decimals = 2) %>%
  #fmt_percent(columns = contains("complete"), decimals = 0) %>% 
  # change all the numeric.mean and numeric.sd column names to Mean and SD
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.mean"), label = "Mean") %>% deframe()
  ) %>% 
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.sd"), label = "SD") %>% deframe()
  ) %>%
  # shade the item means so easier items / stronger cohorts appear darker
  data_color(
    columns = contains("numeric.mean"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  )
2013-14 2014-15 2015-16 2016-17 2017-18 2018-19 2019-20 2020-21 2021-22
n_missing Mean SD n_missing Mean SD n_missing Mean SD n_missing Mean SD n_missing Mean SD n_missing Mean SD n_missing Mean SD n_missing Mean SD n_missing Mean SD
AnonID 0 NA NA 0 NA NA 0 NA NA 0 NA NA 0 NA NA 0 NA NA 0 NA NA 0 NA NA 0 NA NA
e1_E1 0 4.32 1.41 0 4.40 1.32 0 4.45 1.29 0 4.51 1.16 0 4.44 1.50 0 4.37 1.54 0 4.51 1.40 0 4.49 1.41 0 4.36 1.59
e2_E0 0 4.31 1.65 0 4.41 1.53 0 4.49 1.45 0 4.54 1.37 909 NaN NA 1145 NaN NA 1216 NaN NA 1171 NaN NA 992 NaN NA
e3_E2 0 3.33 2.36 0 3.24 2.39 0 3.39 2.34 0 3.61 2.24 0 3.60 2.24 0 3.66 2.21 0 3.87 2.10 0 3.80 2.14 0 3.83 2.12
e4_E3 0 3.72 1.43 0 3.83 1.33 0 3.91 1.38 0 4.12 1.22 0 4.07 1.44 0 4.06 1.43 0 4.09 1.46 0 4.17 1.40 0 4.21 1.40
e5_E4 0 4.05 1.96 0 4.19 1.84 0 4.28 1.76 0 4.33 1.70 0 3.95 2.04 0 4.05 1.96 0 4.10 1.93 0 4.21 1.82 0 4.05 1.96
e6_E5 0 1.47 1.84 0 1.52 1.85 0 1.81 1.95 0 2.04 1.97 0 1.75 1.82 0 1.66 1.83 0 1.77 1.87 0 1.83 1.91 0 1.85 1.96
e7_E6 0 3.57 2.17 0 3.47 2.22 0 3.69 2.12 0 4.15 1.83 0 3.80 2.08 0 3.81 2.06 0 3.93 2.00 0 3.92 1.99 0 3.69 2.14
e8_E0 0 2.97 2.46 0 3.04 2.44 0 3.32 2.36 0 3.71 2.19 909 NaN NA 1145 NaN NA 1216 NaN NA 1171 NaN NA 992 NaN NA
e9_E8 0 4.31 1.73 0 4.46 1.55 0 4.44 1.57 0 4.66 1.27 0 4.22 1.81 0 4.38 1.64 0 4.39 1.64 0 4.41 1.62 0 4.33 1.70
e10_E9 0 3.19 2.03 0 3.29 2.00 0 3.34 1.97 0 3.62 1.88 0 3.65 1.85 0 3.65 1.89 0 3.78 1.86 0 3.92 1.78 0 3.91 1.79
e11_E0 0 4.05 1.45 0 4.16 1.40 0 4.13 1.42 0 4.26 1.34 909 NaN NA 1145 NaN NA 1216 NaN NA 1171 NaN NA 992 NaN NA
e12_E12 0 3.99 1.90 0 3.93 1.95 0 4.10 1.81 0 4.29 1.65 0 4.16 1.79 0 4.15 1.79 0 4.26 1.71 0 4.25 1.72 0 4.20 1.76
e13_E13 0 3.69 1.95 0 3.68 1.94 0 3.82 1.82 0 4.07 1.71 0 3.72 1.91 0 3.83 1.88 0 3.92 1.83 0 3.96 1.80 0 3.85 1.88
e14_E14 0 2.44 1.93 0 2.54 1.93 0 2.85 1.92 0 3.03 1.90 0 2.85 1.95 0 2.88 1.93 0 3.05 1.94 0 3.02 1.96 0 3.05 2.01
e15_E15 0 3.57 2.00 0 3.51 2.02 0 3.67 1.95 0 3.69 2.02 0 3.54 2.05 0 3.50 2.05 0 3.46 2.10 0 3.73 1.97 0 3.57 2.08
e16_E16 0 4.51 1.49 0 4.49 1.52 0 4.55 1.43 0 4.67 1.24 0 4.45 1.57 0 4.41 1.61 0 4.43 1.59 0 4.47 1.54 0 4.59 1.37
e17_E17 0 4.69 1.21 0 4.59 1.38 0 4.74 1.12 0 4.76 1.08 0 4.65 1.28 0 4.59 1.37 0 4.65 1.28 0 4.64 1.30 0 4.65 1.27
e18_E18 0 3.48 2.30 0 3.45 2.31 0 3.65 2.22 0 3.84 2.11 0 3.72 2.18 0 3.66 2.21 0 3.88 2.08 0 3.89 2.08 0 3.96 2.03
e19_E19 0 2.28 2.49 0 2.26 2.49 0 2.60 2.50 0 3.01 2.45 0 2.89 2.47 0 3.07 2.44 0 3.17 2.41 0 3.25 2.39 0 3.26 2.38
e20_E20 0 1.82 2.41 0 1.81 2.41 0 1.99 2.45 0 2.46 2.50 0 2.21 2.48 0 2.29 2.49 0 2.37 2.50 0 2.25 2.49 0 2.43 2.50
e0_E7 815 NaN NA 923 NaN NA 719 NaN NA 757 NaN NA 0 1.90 2.43 0 1.91 2.43 0 2.02 2.45 0 2.14 2.48 0 2.03 2.46
e0_E10 815 NaN NA 923 NaN NA 719 NaN NA 757 NaN NA 0 2.01 1.84 0 2.07 1.84 0 2.36 1.89 0 2.48 1.91 0 2.38 1.93
e0_E11 815 NaN NA 923 NaN NA 719 NaN NA 757 NaN NA 0 3.77 1.82 0 3.78 1.79 0 3.88 1.75 0 3.85 1.79 0 3.77 1.84

2 Testing assumptions

Before applying IRT, we should check that the data satisfies the assumptions needed by the model. In particular, to use a 1-dimensional IRT model, we should have some evidence of unidimensionality in the test scores.

Since the combined data on the two versions of the test have large portions of “missing” data (e.g. no responses to new questions from students who completed the old test), it is not possible to carry out the factor analysis as in the analyse-test script, since the factor analysis routine does not work with missing data.

Instead, in the next section we proceed directly to fitting the IRT model, and using the \(Q_3\) check for local independence. In the final section, we also run a factor analysis for the data from the new test only.

3 Fitting 2 parameter logistic MIRT model

The mirt implementation of the generalised partial credit model (gpcm) requires that the partial marks are consecutive integers. We therefore need to work around this by adjusting our scores into that form (e.g. replacing scores of 0, 2.5, 5 with 1, 2, 3), while keeping track of the true scores attached to each mark level so that we can properly compute expected scores later on.

# save just the matrix of scores
item_scores <- test_scores %>% 
  select(matches("^e\\d"))

# Determine the mark levels for each item.
# mirt's gpcm itemtype needs consecutive integer categories, so each
# distinct observed score on an item is mapped to an integer "level";
# the original score is kept alongside so expected scores can be
# reconstructed later.
mark_levels <- item_scores %>% 
  pivot_longer(everything(), names_to = "item", values_to = "score") %>% 
  distinct() %>% 
  # we don't want to give a level to NA
  filter(!is.na(score)) %>% 
  arrange(parse_number(item), score) %>% 
  group_by(item) %>%
  mutate(order = row_number()) %>% 
# Note that the convention used by mirt is for items that have only 2 levels (i.e. 0 marks or full marks),
# the columns are P.0 and P.1, while other items are indexed from 1, i.e. P.1, P.2, ...
# https://github.com/philchalmers/mirt/blob/accd2383b9a4d17a4cab269717ce98434900b62c/R/probtrace.R#L57
  mutate(level = case_when(
    max(order) == 2 ~ order - 1,
    # * 1.0 presumably keeps both case_when branches numeric — TODO confirm
    TRUE ~ order * 1.0
  )) %>% 
  # levelname matches the column names probtrace() produces, e.g. "e1_E1.P.1"
  mutate(levelname = paste0(item, ".P.", level))

# Use the mark_levels table to replace scores with levels
# (first pivot the data to long form, make the replacement, then pivot back to wide again)
item_scores_levelled <- item_scores %>% 
  # temporarily add row identifiers
  mutate(row = row_number()) %>% 
  pivot_longer(cols = -row, names_to = "item", values_to = "score") %>% 
  left_join(mark_levels %>% select(item, score, level), by = c("item", "score")) %>% 
  select(-score) %>% 
  pivot_wider(names_from = "item", values_from = "level") %>% 
  select(-row)
Show model fitting output
# Fit a unidimensional generalised partial credit model to the levelled scores.
irt_fit <- mirt(
  data = item_scores_levelled, # just the columns with question scores
  model = 1,          # number of factors to extract
  itemtype = "gpcm",  # generalised partial credit model
  SE = TRUE           # estimate standard errors
  )
## 
Iteration: 1, Log-Lik: -140201.896, Max-Change: 2.44104
Iteration: 2, Log-Lik: -125840.511, Max-Change: 0.51042
Iteration: 3, Log-Lik: -124316.163, Max-Change: 0.48192
Iteration: 4, Log-Lik: -124054.077, Max-Change: 0.18441
Iteration: 5, Log-Lik: -123928.237, Max-Change: 0.13866
Iteration: 6, Log-Lik: -123873.691, Max-Change: 0.12047
Iteration: 7, Log-Lik: -123840.232, Max-Change: 0.07726
Iteration: 8, Log-Lik: -123829.225, Max-Change: 0.06291
Iteration: 9, Log-Lik: -123822.166, Max-Change: 0.02540
Iteration: 10, Log-Lik: -123817.865, Max-Change: 0.03143
Iteration: 11, Log-Lik: -123815.728, Max-Change: 0.01224
Iteration: 12, Log-Lik: -123814.106, Max-Change: 0.03540
Iteration: 13, Log-Lik: -123812.966, Max-Change: 0.00751
Iteration: 14, Log-Lik: -123812.611, Max-Change: 0.00306
Iteration: 15, Log-Lik: -123812.455, Max-Change: 0.00220
Iteration: 16, Log-Lik: -123812.314, Max-Change: 0.00439
Iteration: 17, Log-Lik: -123812.238, Max-Change: 0.00048
Iteration: 18, Log-Lik: -123812.233, Max-Change: 0.00158
Iteration: 19, Log-Lik: -123812.206, Max-Change: 0.00065
Iteration: 20, Log-Lik: -123812.203, Max-Change: 0.00734
Iteration: 21, Log-Lik: -123812.159, Max-Change: 0.00062
Iteration: 22, Log-Lik: -123812.158, Max-Change: 0.00062
Iteration: 23, Log-Lik: -123812.149, Max-Change: 0.00007
## 
## Calculating information matrix...

3.1 Local independence

We compute Yen’s \(Q_3\) (1984) to check for any dependence between items after controlling for \(\theta\). This gives a score for each pair of items, with scores above 0.2 regarded as problematic (see DeMars, p. 48).

# Visualise the Q3 statistics for all item pairs (upper triangle only).
corrplot::corrplot(as.matrix(irt_residuals), type = "upper")

This shows that most item pairs are independent, with only one showing cause for concern:

# List the item pairs whose |Q3| exceeds the 0.2 threshold, keeping each
# pair once (item1 before item2 in question order).
irt_residuals %>%
  rownames_to_column(var = "item1") %>%
  as_tibble() %>%
  pivot_longer(cols = starts_with("e"), names_to = "item2", values_to = "Q3_score") %>%
  filter(abs(Q3_score) > 0.2, parse_number(item1) < parse_number(item2)) %>%
  gt()
item1 item2 Q3_score
e16_E16 e17_E17 0.2742701

This pair of questions on variations of the chain rule was also identified by the analysis of the first version of the test.

Given that this violation of the local independence assumption is very mild, we proceed using this model.

3.2 Model parameters

We then compute factor score estimates and augment the existing data frame with these estimates, to keep everything in one place. To do the estimation, we use the fscores() function from the mirt package which takes in a computed model object and computes factor score estimates according to the method specified. We will use the EAP method for factor score estimation, which is the “expected a-posteriori” method, the default.

# Augment the score data with EAP factor-score estimates from the fitted
# model. fscores() returns a one-column matrix; take the first column so
# F1 is stored as a plain numeric vector rather than a matrix column
# (matrix columns behave surprisingly in joins, ggplot and CSV output).
test_scores <- test_scores %>%
  mutate(F1 = fscores(irt_fit, method = "EAP")[, 1])

We can also calculate the model coefficient estimates using a generic function coef() which is used to extract model coefficients from objects returned by modeling functions. We will set the IRTpars argument to TRUE, which means slope intercept parameters will be converted into traditional IRT parameters.

# Extract item parameter estimates in traditional IRT parameterisation
# (discrimination a, difficulty b) rather than slope-intercept form.
irt_coefs <- coef(irt_fit, IRTpars = TRUE)

The resulting object irt_coefs is a list, with one element for each question, and an additional GroupPars element that we won’t be using. For each question, the object records several values:

  • a is discrimination
  • b is difficulty
  • endpoints of the 95% confidence intervals are also shown

To make this output a little more user friendly, we use the tidy_mirt_coefs function that we have provided to produce a single data frame with a row for each question.

# Helper (provided alongside this script) that flattens the coef() list
# into a single data frame with one row per question/parameter.
source("fn_tidy_mirt_coefs.R")

tidy_irt_coefs <- tidy_mirt_coefs(irt_coefs)

Here is a nicely formatted table of the result:

# Collapse the CI endpoint columns into a single "[lo, hi]" string column.
tidy_irt_coefs_with_cis <- tidy_irt_coefs %>% 
  mutate(ci = str_glue("[{round(CI_2.5, 3)}, {round(CI_97.5, 3)}]")) %>% 
  select(-starts_with("CI_"))

# Side-by-side table: discrimination (a) next to the difficulty
# parameters (b1, b2, ...) for each question.
tidy_irt_coefs_with_cis %>% 
  filter(par == "a") %>% 
  select(-par) %>% 
  # prefix the discrimination columns so they can be targeted below
  rename_with(.fn = ~ paste0("a_", .x), .cols = -Question) %>% 
  left_join(
    tidy_irt_coefs_with_cis %>% 
      filter(str_detect(par, "^b")),
    by = "Question"
  ) %>%
  gt(groupname_col = "Question") %>%
  fmt_number(columns = contains("est"), decimals = 3) %>%
  data_color(
    columns = contains("a_est"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = c("est"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  ) %>%
  tab_spanner(label = "Discrimination", columns = contains("a_")) %>%
  tab_spanner(label = "Difficulty", columns = c("par", "est", "ci")) %>% 
  cols_label(
    a_est = "Est.",
    a_ci = "95% CI",
    par = "Parameter",
    est = "Est.",
    ci = "95% CI"
  )
Discrimination Difficulty
Est. 95% CI Parameter Est. 95% CI
e1_E1
0.590 [0.541, 0.64] b1 −0.876 [-1.064, -0.689]
0.590 [0.541, 0.64] b2 −4.079 [-4.428, -3.73]
e2_E0
0.399 [0.319, 0.479] b1 1.772 [1.028, 2.515]
0.399 [0.319, 0.479] b2 −8.020 [-9.636, -6.403]
e3_E2
1.321 [1.238, 1.404] b1 −0.953 [-1.013, -0.894]
e4_E3
0.231 [0.217, 0.245] b1 17.935 [13.263, 22.606]
0.231 [0.217, 0.245] b2 −20.086 [-24.742, -15.43]
0.231 [0.217, 0.245] b3 2.305 [1.366, 3.245]
0.231 [0.217, 0.245] b4 4.467 [2.831, 6.103]
0.231 [0.217, 0.245] b5 −11.709 [-13.305, -10.113]
0.231 [0.217, 0.245] b6 0.709 [0.085, 1.333]
0.231 [0.217, 0.245] b7 −3.766 [-4.362, -3.171]
0.231 [0.217, 0.245] b8 5.000 [4.247, 5.752]
0.231 [0.217, 0.245] b9 −2.361 [-3.126, -1.596]
0.231 [0.217, 0.245] b10 −3.494 [-4.088, -2.899]
0.231 [0.217, 0.245] b11 8.825 [7.781, 9.87]
0.231 [0.217, 0.245] b12 −16.742 [-18.07, -15.414]
e5_E4
1.197 [1.112, 1.282] b1 −1.617 [-1.713, -1.522]
e6_E5
0.423 [0.394, 0.452] b1 0.438 [0.312, 0.565]
0.423 [0.394, 0.452] b2 9.752 [8.833, 10.671]
0.423 [0.394, 0.452] b3 −7.838 [-8.724, -6.951]
e7_E6
0.780 [0.732, 0.828] b1 1.430 [1.229, 1.63]
0.780 [0.732, 0.828] b2 −3.571 [-3.826, -3.317]
e8_E0
0.555 [0.457, 0.652] b1 −1.272 [-1.499, -1.044]
e9_E8
1.522 [1.413, 1.632] b1 −1.764 [-1.855, -1.672]
e10_E9
0.758 [0.71, 0.805] b1 −0.915 [-1.012, -0.818]
0.758 [0.71, 0.805] b2 −1.338 [-1.451, -1.224]
e11_E0
0.310 [0.264, 0.355] b1 −1.823 [-2.661, -0.984]
0.310 [0.264, 0.355] b2 −5.607 [-6.52, -4.694]
0.310 [0.264, 0.355] b3 8.803 [7.095, 10.511]
0.310 [0.264, 0.355] b4 −13.112 [-15.278, -10.946]
e12_E12
0.537 [0.501, 0.572] b1 1.525 [1.229, 1.821]
0.537 [0.501, 0.572] b2 −0.655 [-0.936, -0.374]
0.537 [0.501, 0.572] b3 −5.447 [-5.855, -5.039]
e13_E13
0.546 [0.513, 0.58] b1 −0.852 [-1.002, -0.701]
0.546 [0.513, 0.58] b2 3.682 [3.274, 4.09]
0.546 [0.513, 0.58] b3 −6.768 [-7.289, -6.248]
e14_E14
0.560 [0.529, 0.59] b1 1.473 [1.242, 1.704]
0.560 [0.529, 0.59] b2 −3.709 [-3.958, -3.46]
0.560 [0.529, 0.59] b3 8.496 [7.624, 9.367]
0.560 [0.529, 0.59] b4 −3.958 [-4.758, -3.157]
0.560 [0.529, 0.59] b5 −4.350 [-4.709, -3.99]
e15_E15
0.230 [0.212, 0.248] b1 9.675 [8.618, 10.733]
0.230 [0.212, 0.248] b2 −7.609 [-8.506, -6.712]
0.230 [0.212, 0.248] b3 4.145 [3.518, 4.772]
0.230 [0.212, 0.248] b4 −11.730 [-12.764, -10.695]
e16_E16
1.612 [1.492, 1.733] b1 −1.878 [-1.974, -1.782]
e17_E17
1.895 [1.742, 2.049] b1 −2.022 [-2.121, -1.922]
e18_E18
1.103 [1.028, 1.178] b1 −1.206 [-1.284, -1.128]
e19_E19
1.631 [1.538, 1.724] b1 −0.287 [-0.326, -0.248]
e20_E20
1.880 [1.773, 1.987] b1 0.206 [0.17, 0.242]
e0_E7
1.028 [0.943, 1.112] b1 0.548 [0.48, 0.616]
e0_E10
0.398 [0.371, 0.424] b1 4.976 [4.406, 5.545]
0.398 [0.371, 0.424] b2 −4.982 [-5.508, -4.457]
0.398 [0.371, 0.424] b3 2.278 [1.933, 2.623]
0.398 [0.371, 0.424] b4 −0.263 [-0.618, 0.092]
0.398 [0.371, 0.424] b5 1.044 [0.669, 1.419]
0.398 [0.371, 0.424] b6 −0.380 [-0.754, -0.006]
0.398 [0.371, 0.424] b7 1.900 [1.508, 2.292]
0.398 [0.371, 0.424] b8 −2.689 [-3.11, -2.267]
e0_E11
0.393 [0.365, 0.421] b1 2.303 [1.774, 2.832]
0.393 [0.365, 0.421] b2 −5.366 [-5.908, -4.825]
0.393 [0.365, 0.421] b3 9.080 [7.856, 10.303]
0.393 [0.365, 0.421] b4 −3.556 [-4.702, -2.41]
0.393 [0.365, 0.421] b5 −8.877 [-9.709, -8.044]

These values are also saved to the file output/uoe_pre-vs-post_gpcm-results.csv.

# Persist the tidy coefficient table for downstream use.
write_csv(tidy_irt_coefs, "output/uoe_pre-vs-post_gpcm-results.csv")

3.3 Information curves

# Grid of ability values at which to evaluate the information curves.
theta <- seq(-6, 6, by=0.05)

# Item-level information at each theta (one column per item).
info_matrix <- testinfo(irt_fit, theta, individual = TRUE)
colnames(info_matrix) <- test_versions %>% pull(label)
# Long format for plotting: one row per (theta, item), joined with the
# MATH group of each item.
item_info_data <- info_matrix %>% 
  as_tibble() %>% 
  bind_cols(theta = theta) %>% 
  pivot_longer(cols = -theta, names_to = "item", values_to = "info_y") %>% 
  left_join(test_versions %>% select(item = label, MATH_group), by = "item") %>% 
  # order items by question number for facets and legends
  mutate(item = fct_reorder(item, parse_number(item)))

3.3.1 Item information curves

Breaking this down by question helps to highlight those questions that are most/least informative:

# One information curve per item, faceted so the most and least
# informative questions stand out.
ggplot(item_info_data, aes(x = theta, y = info_y, colour = item)) +
  geom_line() +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  facet_wrap(vars(item)) +
  labs(y = "Information") +
  theme_minimal()

We can also compute the sums of different subsets of the information curves – here, we will look at the questions based on their MATH group:

# Sum the item information curves over all items and within each MATH group.
item_info_data %>% 
  group_by(theta) %>% 
  summarise(
    items_all = sum(info_y),
    items_A = sum(ifelse(MATH_group == "A", info_y, 0)),
    items_B = sum(ifelse(MATH_group == "B", info_y, 0)),
    items_C = sum(ifelse(MATH_group == "C", info_y, 0))
  ) %>% 
  pivot_longer(cols = starts_with("items_"), names_to = "items", names_prefix = "items_", values_to = "info_y") %>% 
  # fix legend/colour order: all items first, then the groups
  mutate(items = fct_relevel(items, "all", "A", "B", "C")) %>% 
  ggplot(aes(x = theta, y = info_y, colour = items)) +
  geom_line() +
  scale_colour_manual(values = c("all" = "#000000", MATH_colours)) +
  labs(x = "Ability", y = "Information") +
  theme_minimal()

# saves the plot displayed just above (ggsave defaults to the last plot)
ggsave("output/uoe_pre-vs-post_info-curves_A-vs-B.pdf", units = "cm", width = 14, height = 6)

This shows that the information in the MATH Group B questions is at a higher point on the ability scale than for the MATH Group A questions.

Since the number of items in each case is different, we consider instead the mean information per item:

# As above, but normalised to the MEAN information per item, since the
# MATH groups contain different numbers of items.
item_info_data %>% 
  group_by(theta) %>% 
  summarise(
    items_all = sum(info_y) / n(),
    # divide each group's total by the number of items in that group
    items_A = sum(ifelse(MATH_group == "A", info_y, 0)) / sum(ifelse(MATH_group == "A", 1, 0)),
    items_B = sum(ifelse(MATH_group == "B", info_y, 0)) / sum(ifelse(MATH_group == "B", 1, 0)),
    items_C = sum(ifelse(MATH_group == "C", info_y, 0)) / sum(ifelse(MATH_group == "C", 1, 0))
  ) %>% 
  pivot_longer(cols = starts_with("items_"), names_to = "items", names_prefix = "items_", values_to = "info_y") %>% 
  mutate(items = fct_relevel(items, "all", "A", "B", "C")) %>% 
  ggplot(aes(x = theta, y = info_y, colour = items)) +
  geom_line() +
  scale_colour_manual(values = c("all" = "#000000", MATH_colours)) +
  labs(x = "Ability", y = "Mean information per item") +
  theme_minimal()

# saves the plot displayed just above
ggsave("output/uoe_pre-vs-post_info-curves_A-vs-B-avg.pdf", units = "cm", width = 10, height = 6)

This shows that items of each MATH group are giving broadly similar levels of information on average, but at different points on the ability scale.

3.4 Total information

Using mirt’s areainfo function, we can find the total area under the information curves.

# Area under the total test information curve over theta in [-4, 4].
info_gpcm <- areainfo(irt_fit, c(-4, 4))
gt(info_gpcm)
LowerBound UpperBound Info TotalInfo Proportion nitems
-4 4 33.70541 36.09974 0.9336745 23

This shows that the total information in all items is 36.0997418.

3.4.1 Information by item

# Total information per item over theta in [-4, 4], one areainfo() call
# per item (item_num indexes items in mirt's fitted order).
tidy_info <- test_versions %>%
  mutate(item_num = row_number()) %>% 
  mutate(TotalInfo = purrr::map_dbl(
    item_num,
    ~ areainfo(irt_fit,
               c(-4, 4),
               which.items = .x) %>% pull(TotalInfo)
  ))

# Table of items sorted by information, shaded by information and outcome.
tidy_info %>%
  select(-item_num) %>% 
  arrange(-TotalInfo) %>% 
  #group_by(outcome) %>% 
  gt() %>% 
  # items absent from one version have no pre/post label; show blank
  sub_missing(columns = one_of("pre", "post"), missing_text = "") %>% 
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  data_color(
    columns = contains("info"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("outcome"),
    colors = scales::col_factor(palette = c("viridis"), domain = NULL)
  ) %>%
  cols_label(
    TotalInfo = "Information"
  )
pre post MATH_group description label outcome Information
E10 B 2x2 system possibilities e0_E10 added 3.1772358
e14 E14 B find minimum gradient of cubic e14_E14 unchanged 2.7973270
e4 E3 A completing the square e4_E3 unchanged 2.7612246
E11 C using max and min functions e0_E11 added 1.9590963
e17 E17 A chain rule e17_E17 unchanged 1.8952920
e20 E20 B product rule with given values e20_E20 unchanged 1.8798335
e13 E13 A equation of tangent e13_E13 unchanged 1.6352955
e19 E19 B area between curve and x-axis (in 2 parts) e19_E19 unchanged 1.6307232
e16 E16 A trig chain rule e16_E16 unchanged 1.6124480
e12 E12 A find point with given slope e12_E12 unchanged 1.6085723
e7 E6 B graphical vector sum e7_E6 unchanged 1.5598502
e9 E8 A simplify logs e9_E8 unchanged 1.5223517
e10 E9 B identify graph of rational functions e10_E9 unchanged 1.5143576
e3 E2 B composition of functions e3_E2 unchanged 1.3212157
e6 E5 A trig wave function e6_E5 unchanged 1.2639778
e11 A summing arithmetic progression e11_E0 removed 1.2033969
e5 E4 A trig double angle formula e5_E4 unchanged 1.1972207
e1 E1 A properties of fractions e1_E1 unchanged 1.1777765
e18 E18 A definite integral e18_E18 unchanged 1.1033115
E7 B angle between 3d vectors (in context) e0_E7 added 1.0275643
e15 E15 A find and classify stationary points of cubic e15_E15 unchanged 0.9123828
e2 A find intersection of lines e2_E0 removed 0.7900902
e8 A compute angle between 3d vectors e8_E0 removed 0.5491977

Restricting instead to the range \(-2\leq\theta\leq2\):

# Same per-item table, but restricted to theta in [-2, 2].
# NOTE(review): here `Info` (area within the restricted range) is pulled,
# unlike the previous chunk which pulled `TotalInfo`; the value is stored
# under the name TotalInfo only so the display code below can be reused.
tidy_info <- test_versions %>%
  mutate(item_num = row_number()) %>% 
  mutate(TotalInfo = purrr::map_dbl(
    item_num,
    ~ areainfo(irt_fit,
               c(-2, 2),
               which.items = .x) %>% pull(Info)
  ))

tidy_info %>%
  select(-item_num) %>% 
  arrange(-TotalInfo) %>% 
  #group_by(outcome) %>% 
  gt() %>% 
  # items absent from one version have no pre/post label; show blank
  sub_missing(columns = one_of("pre", "post"), missing_text = "") %>% 
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  data_color(
    columns = contains("info"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("outcome"),
    colors = scales::col_factor(palette = c("viridis"), domain = NULL)
  ) %>%
  cols_label(
    TotalInfo = "Information"
  )
pre post MATH_group description label outcome Information
E10 B 2x2 system possibilities e0_E10 added 2.6894906
e14 E14 B find minimum gradient of cubic e14_E14 unchanged 2.3637938
e20 E20 B product rule with given values e20_E20 unchanged 1.7882056
e4 E3 A completing the square e4_E3 unchanged 1.7315578
e19 E19 B area between curve and x-axis (in 2 parts) e19_E19 unchanged 1.4984133
E11 C using max and min functions e0_E11 added 1.3954500
e7 E6 B graphical vector sum e7_E6 unchanged 1.2153004
e13 E13 A equation of tangent e13_E13 unchanged 1.1544884
e12 E12 A find point with given slope e12_E12 unchanged 1.0519507
e3 E2 B composition of functions e3_E2 unchanged 1.0300617
e10 E9 B identify graph of rational functions e10_E9 unchanged 1.0158973
e17 E17 A chain rule e17_E17 unchanged 0.9271541
e9 E8 A simplify logs e9_E8 unchanged 0.8916069
e16 E16 A trig chain rule e16_E16 unchanged 0.8821230
E7 B angle between 3d vectors (in context) e0_E7 added 0.7691369
e6 E5 A trig wave function e6_E5 unchanged 0.7506395
e18 E18 A definite integral e18_E18 unchanged 0.7477240
e5 E4 A trig double angle formula e5_E4 unchanged 0.7178985
e15 E15 A find and classify stationary points of cubic e15_E15 unchanged 0.4887409
e1 E1 A properties of fractions e1_E1 unchanged 0.4321422
e11 A summing arithmetic progression e11_E0 removed 0.4304909
e8 A compute angle between 3d vectors e8_E0 removed 0.2548706
e2 A find intersection of lines e2_E0 removed 0.2211928

3.4.2 Evaluating the changes

# Information curves aggregated by test version and by changed items:
# "pre"/"post" are the totals for each version, "added"/"removed" are the
# contributions of the items that changed between versions.
info_comparison_data <- item_info_data %>% 
  mutate(item = as.character(item)) %>% 
  left_join(test_versions %>% mutate(item = as.character(label)), by = "item") %>% 
  group_by(theta) %>% 
  summarise(
    items_pre = sum(ifelse(outcome %in% c("unchanged", "removed"), info_y, 0)),
    items_post = sum(ifelse(outcome %in% c("unchanged", "added"), info_y, 0)),
    items_added = sum(ifelse(outcome %in% c("added"), info_y, 0)),
    items_removed = sum(ifelse(outcome %in% c("removed"), info_y, 0))
  ) %>% 
  pivot_longer(cols = starts_with("items_"), names_to = "items", names_prefix = "items_", values_to = "info_y")

# Total information curve for each test version.
test_info_total_plot <- info_comparison_data %>% 
  filter(items %in% c("pre", "post")) %>% 
  mutate(items = case_when(items == "pre" ~ "Version 1", items == "post" ~ "Version 2")) %>% 
  #mutate(items = fct_relevel(items, "pre", "post")) %>% 
  ggplot(aes(x = theta, y = info_y, colour = items, linetype = items)) +
  geom_line() +
  scale_colour_brewer("Edinburgh MDT", palette = "Set1") +
  scale_linetype_manual("Edinburgh MDT", values = c("dashed", "solid")) +
  labs(x = "Ability", y = "Information") +
  theme_minimal() +
  theme(legend.position = "top",
        legend.title = element_text(face = "bold"))

#test_info_total_plot
# NOTE(review): ggsave() with no `plot` argument saves the last *displayed*
# plot. test_info_total_plot is assigned but never printed (the line above
# is commented out), so the wrong figure was being written; pass the plot
# explicitly.
ggsave("output/uoe_pre-vs-post_info.pdf", plot = test_info_total_plot, units = "cm", width = 8, height = 8)

# Information contributed by the added vs removed items.
test_info_changes_plot <- info_comparison_data %>% 
  filter(!items %in% c("pre", "post")) %>% 
  ggplot(aes(x = theta, y = info_y, colour = items, linetype = items)) +
  geom_line() +
  scale_colour_brewer("Questions", palette = "Paired", direction = -1) +
  scale_linetype_manual("Questions", values = c("solid", "dashed")) +
  labs(x = "Ability", y = "Information") +
  theme_minimal() +
  theme(legend.position = "top")

#test_info_changes_plot
# Same fix as above: pass the plot explicitly rather than relying on the
# last-displayed plot.
ggsave("output/uoe_pre-vs-post_changes.pdf", plot = test_info_changes_plot, units = "cm", width = 8, height = 8)
# display the two plots side by side (patchwork)
test_info_total_plot + test_info_changes_plot

ggsave("output/uoe_pre-vs-post_info-summary.pdf", plot = test_info_total_plot + test_info_changes_plot, units = "cm", width = 16, height = 8)

The changes have increased the total information in the test:

# Integrate test information over the ability range [-4, 4] for each version,
# using mirt::areainfo() on the fitted model and selecting each version's
# item set (pre = unchanged + removed, post = unchanged + added).
info_old <- areainfo(irt_fit,
                     c(-4, 4),
                     which.items = test_versions %>% filter(outcome %in% c("unchanged", "removed")) %>% pull(item_num))
info_new <- areainfo(irt_fit,
                     c(-4, 4),
                     which.items = test_versions %>% filter(outcome %in% c("unchanged", "added")) %>% pull(item_num))

# Combine the two results into one table, dropping the interval-description
# columns; mean information per item lets the versions be compared on a
# per-item basis as well as in total.
versions_info <- bind_rows("Version 1" = info_old,
                           "Version 2" = info_new,
                           .id = "version") %>% 
  select(-LowerBound, -UpperBound, -Info, -Proportion) %>% 
  mutate(mean = TotalInfo / nitems)

# Presentation table with human-readable column headers.
versions_info %>% gt() %>%
  cols_label(
    version = "Test version",
    TotalInfo = "Total information",
    nitems = "Number of items",
    mean = "Mean information per item"
  )
Test version Total information Number of items Mean information per item
Version 1 29.93585 20 1.496792
Version 2 33.55706 20 1.677853

3.5 Response curves

Since the gpcm model is more complicated, there is a characteristic curve for each possible score on the question:

# Category response traces: probability of each score level at each ability.
# probtrace() returns one column per item-and-category; pivoting long gives
# one row per (theta, category). Joining on mark_levels recovers the numeric
# score for each category column — NOTE(review): assumes mark_levels$levelname
# matches the probtrace() column names exactly; confirm upstream.
trace_data <- probtrace(irt_fit, theta) %>% 
  as_tibble() %>% 
  bind_cols(theta = theta) %>% 
  pivot_longer(cols = -theta, names_to = "level", values_to = "y") %>% 
  left_join(mark_levels %>% select(item, level = levelname, score), by = "level") %>% 
  mutate(score = as.factor(score))

# One panel per item; within each panel, one curve per score level showing
# the probability of that response at each ability. Items are ordered by
# their question number rather than alphabetically.
ordered_traces <- trace_data %>%
  mutate(item = fct_reorder(item, parse_number(item)))

ggplot(ordered_traces, aes(x = theta, y = y, colour = score)) +
  geom_line() +
  facet_wrap(vars(item)) +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  labs(y = "Probability of response") +
  theme_minimal()

To get a simplified picture for each question, we compute the expected score at each ability level:

# Expected score per item at each ability: sum over score levels of
# score * P(score | theta). Note `score` is a factor, so it must be converted
# via as.character() first — as.double() on the factor directly would return
# the level index, not the score value.
expected_scores <- trace_data %>% 
  mutate(item = fct_reorder(item, parse_number(item))) %>% 
  group_by(item, theta) %>% 
  summarise(expected_score = sum(as.double(as.character(score)) * y), .groups = "drop")

# One panel per item showing how its expected score rises with ability —
# the GPCM analogue of an item characteristic curve.
ggplot(expected_scores,
       aes(x = theta, y = expected_score, colour = item)) +
  geom_line() +
  facet_wrap(vars(item)) +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  labs(y = "Expected score") +
  theme_minimal()

The resulting curves look quite similar to those from the 2PL, allowing for some similar interpretation. For instance, superimposing all the curves shows that there is a spread of difficulties (i.e. thetas where the expected score is 2.5/5) and that some questions are more discriminating than others (i.e. steeper slopes):

# Superimpose all expected-score curves on one plot, with linetype encoding
# each item's fate between versions; ggplotly() adds the item name as a
# hover tooltip. (Removed an unused `item_removed` helper column — `outcome`
# already drives the linetype.)
plt <- expected_scores %>% 
  left_join(test_versions %>% mutate(item = as.factor(label)), by = "item") %>% 
  ggplot(aes(x = theta, y = expected_score, colour = item, text = item)) +
  geom_line(aes(linetype = outcome)) +
  scale_colour_viridis_d(option = "plasma", end = 0.8, direction = -1) +
  scale_linetype_manual("outcome", values = c("unchanged" = "solid", "removed" = "dashed", "added" = "twodash")) +
  labs(y = "Expected score") +
  theme_minimal()

ggplotly(plt, tooltip = "text")
ggsave(plot = plt, file = "output/uoe_pre-vs-post_iccs-superimposed.pdf", width = 20, height = 14, units = "cm")

3.5.1 Test response curve

# Test response curves: expected total score at each ability for each test
# version. Version 1 (pre) excludes the added items (labelled "e0_..."),
# Version 2 (post) excludes the removed items (labelled "..._E0").
# Anchored patterns replace the original bare substring matches ("e0"/"E0"),
# which would silently misclassify any future label containing those strings.
total_expected_score <- expected_scores %>% 
  group_by(theta) %>% 
  summarise(
    expected_score_pre = sum(ifelse(!str_detect(item, "^e0_"), expected_score, 0)),
    expected_score_post = sum(ifelse(!str_detect(item, "_E0$"), expected_score, 0))
  ) %>% 
  pivot_longer(cols = starts_with("expected_score_"), names_prefix = "expected_score_", names_to = "test_version", values_to = "expected_score")

# Annotate the expected score of a median-ability (theta = 0) student.
at_theta_zero <- total_expected_score %>% filter(theta == 0)

total_expected_score %>% 
  ggplot(aes(x = theta, y = expected_score, colour = test_version)) +
  geom_line() +
  geom_point(data = at_theta_zero) +
  ggrepel::geom_label_repel(data = at_theta_zero,
                            aes(label = str_glue("{test_version}: {round(expected_score, 1)}")),
                            box.padding = 0.5) +
  scale_colour_viridis_d(option = "plasma", end = 0.7, guide = "none") +
  labs(y = "Expected score") +
  theme_minimal()

4 Factor analysis for the new test only

4.1 Factor analysis setup

Here we redo the factor analysis, but using only the data from the new version of the test.

# Item scores from the post (new) version of the test only.
item_scores_B <- test_scores %>% 
  select(-F1) %>% 
  # NOTE(review): contains() is case-insensitive by default, so this drops
  # both the removed items ("..._E0") AND the added items ("e0_..."), leaving
  # the 17 unchanged items for the factor analysis below. Confirm that
  # excluding the added items here is intentional.
  select(-contains("E0")) %>% 
  filter(test_version == "post") %>% 
  # select only the columns with question scores (names like ex_Ex)
  select(matches("e\\d+_E\\d+"))

The parameters package provides functions that run various checks to see if the data is suitable for factor analysis, and if so, how many factors should be retained.

# Suitability checks (KMO, Bartlett's sphericity) and number-of-factors
# estimation via the parameters package.
# NOTE(review): `structure` masks base::structure() in this session;
# consider a more specific name.
structure <- check_factorstructure(item_scores_B)
n <- n_factors(item_scores_B)

4.1.0.1 Is the data suitable for Factor Analysis?

  • KMO: The Kaiser, Meyer, Olkin (KMO) measure of sampling adequacy suggests that data seems appropriate for factor analysis (KMO = 0.93).
  • Sphericity: Bartlett’s test of sphericity suggests that there is sufficient significant correlation in the data for factor analysis (Chisq(136) = 17111.17, p < .001).

4.1.0.2 Method Agreement Procedure:

The choice of 1 dimensions is supported by 7 (36.84%) methods out of 19 (t, p, Acceleration factor, Scree (SE), Scree (R2), VSS complexity 1, Velicer’s MAP).

plot(n)

summary(n) %>% gt()
n_Factors n_Methods
1 7
2 4
3 1
4 2
5 1
14 1
15 3
#n %>% tibble() %>% gt()

The scree plot shows the eigenvalues associated with each factor:

fa.parallel(item_scores_B, fa = "fa")

## Parallel analysis suggests that the number of factors =  5  and the number of components =  NA

Based on this, there is clear support for a 1-factor solution. We also consider the 2-factor solution.

4.2 1 Factor

Show factanal output
# Maximum-likelihood factor analysis with a single factor. (Rotation has no
# effect with one factor; the argument is kept for symmetry with other fits.)
fitfact <- factanal(item_scores_B, factors = 1, rotation = "varimax")
print(fitfact, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores_B, factors = 1, rotation = "varimax")
## 
## Uniquenesses:
##   e1_E1   e3_E2   e4_E3   e5_E4   e6_E5   e7_E6   e9_E8  e10_E9 e12_E12 e13_E13 
##    0.89    0.78    0.68    0.80    0.86    0.74    0.74    0.76    0.73    0.69 
## e14_E14 e15_E15 e16_E16 e17_E17 e18_E18 e19_E19 e20_E20 
##    0.62    0.86    0.77    0.74    0.79    0.69    0.74 
## 
## Loadings:
##  [1] 0.57 0.51 0.51 0.52 0.55 0.62 0.51 0.56 0.51 0.33 0.47 0.45 0.37 0.49 0.37
## [16] 0.48 0.45
## 
##                Factor1
## SS loadings       4.11
## Proportion Var    0.24
## 
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 1429.84 on 119 degrees of freedom.
## The p-value is 1.86e-223
load <- tidy(fitfact)
# Dot-plot of the 1-factor loadings along a single axis.
# BUG FIX: labels were previously built as paste0("E", rownames(load)), i.e.
# from the row *position*, which mislabels items once the E-number sequence
# has gaps (e.g. row 11 is e14_E14 but was labelled "E11"). Use the actual
# item name from the tidied output instead.
ggplot(load, aes(x = fl1, y = 0)) + 
  geom_point() + 
  geom_label_repel(aes(label = variable), show.legend = FALSE) +
  labs(x = "Factor 1", y = NULL,
       title = "Standardised Loadings", 
       subtitle = "Based upon correlation matrix") +
  theme_minimal()

# Table of 1-factor loadings joined with question descriptions, strongest
# loading first; colour scales highlight loading strength and MATH group.
loadings_with_desc <- load %>%
  select(question = variable, factor_loading = fl1) %>%
  left_join(
    test_versions %>% select(question = label, description, MATH_group),
    by = "question"
  ) %>%
  arrange(desc(factor_loading))

loadings_with_desc %>%
  gt() %>%
  data_color(
    columns = contains("factor"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("MATH"),
    colors = MATH_colours
  )
question factor_loading description MATH_group
e14_E14 0.6168282 find minimum gradient of cubic B
e4_E3 0.5680372 completing the square A
e19_E19 0.5552499 area between curve and x-axis (in 2 parts) B
e13_E13 0.5539675 equation of tangent A
e12_E12 0.5186014 find point with given slope A
e7_E6 0.5128365 graphical vector sum B
e20_E20 0.5118532 product rule with given values B
e9_E8 0.5083549 simplify logs A
e17_E17 0.5075761 chain rule A
e10_E9 0.4862613 identify graph of rational functions B
e16_E16 0.4820832 trig chain rule A
e3_E2 0.4724702 composition of functions B
e18_E18 0.4541077 definite integral A
e5_E4 0.4506912 trig double angle formula A
e15_E15 0.3715183 find and classify stationary points of cubic A
e6_E5 0.3701453 trig wave function A
e1_E1 0.3301510 properties of fractions A

The questions that load most strongly on this factor are again dominated by the MATH Group B questions.

The new “in context” question, e0_E7, is disappointingly low on the factor loading (and on information, shown above). Perhaps the context is sufficiently routine for these students.

4.3 2 Factor

Here we also investigate the 2-factor solution, to see whether these factors are interpretable.

Show factanal output
# Two-factor maximum-likelihood factor analysis with varimax (orthogonal)
# rotation; loadings below 0.3 are suppressed in the printout.
fitfact2 <- factanal(item_scores_B, factors = 2, rotation = "varimax")
print(fitfact2, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores_B, factors = 2, rotation = "varimax")
## 
## Uniquenesses:
##   e1_E1   e3_E2   e4_E3   e5_E4   e6_E5   e7_E6   e9_E8  e10_E9 e12_E12 e13_E13 
##    0.89    0.76    0.67    0.80    0.85    0.72    0.74    0.77    0.73    0.69 
## e14_E14 e15_E15 e16_E16 e17_E17 e18_E18 e19_E19 e20_E20 
##    0.59    0.85    0.59    0.45    0.78    0.69    0.72 
## 
## Loadings:
##         Factor1 Factor2
## e4_E3   0.54           
## e7_E6   0.50           
## e14_E14 0.60           
## e19_E19 0.51           
## e16_E16         0.61   
## e17_E17         0.72   
## e1_E1                  
## e3_E2   0.47           
## e5_E4   0.40           
## e6_E5   0.37           
## e9_E8   0.40    0.30   
## e10_E9  0.42           
## e12_E12 0.43           
## e13_E13 0.50           
## e15_E15                
## e18_E18 0.31    0.36   
## e20_E20 0.49           
## 
##                Factor1 Factor2
## SS loadings       3.02    1.68
## Proportion Var    0.18    0.10
## Cumulative Var    0.18    0.28
## 
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 544.55 on 103 degrees of freedom.
## The p-value is 2.97e-61
load2 <- tidy(fitfact2)

# Scatter the items in the 2-factor loading space, coloured and shaped by
# MATH group, with non-overlapping item labels.
load2_plot <- load2 %>%
  rename(question = variable) %>%
  left_join(test_versions %>% rename(question = label), by = "question") %>%
  ggplot(aes(x = fl1, y = fl2, colour = MATH_group, shape = MATH_group)) +
  geom_point() +
  geom_text_repel(aes(label = question), show.legend = FALSE, alpha = 0.6) +
  scale_colour_manual("MATH group", values = MATH_colours) +
  scale_shape_manual(name = "MATH group", values = c(19, 17, 18)) +
  labs(
    x = "Factor 1 (of 2)",
    y = "Factor 2 (of 2)"
  ) +
  theme_minimal()

# Display a titled copy in the report; save the untitled version for the paper.
load2_plot +
  labs(
    title = "Standardised Loadings",
    subtitle = "Showing the 2-factor model"
  )

ggsave("output/uoe_pre-vs-post_2factor.pdf", plot = load2_plot,
       units = "cm", width = 12, height = 8, dpi = 300)
# For each question, identify the factor it loads on most strongly
# (by absolute loading). slice_max() supersedes top_n(); like top_n() it
# keeps ties, so a question loading equally on both factors appears twice.
main_factors <- load2 %>% 
#  mutate(factorNone = 0.4) %>%  # add this to set the main factor to "None" where all loadings are below 0.4
  pivot_longer(names_to = "factor",
               cols = contains("fl")) %>% 
  mutate(value_abs = abs(value)) %>% 
  group_by(variable) %>% 
  slice_max(value_abs, n = 1) %>% 
  ungroup() %>% 
  transmute(main_factor = factor, variable)

# Presentation table of the 2-factor loadings: grouped by main factor,
# sorted by loading strength, with salient loadings (|loading| > 0.4) bold.
load2 %>% 
  select(-uniqueness) %>% 
  # add the info about which is the main factor
  left_join(main_factors, by = "variable") %>%
  left_join(test_versions %>% select(variable = label, description, MATH_group), by = "variable") %>% 
  arrange(main_factor) %>% 
  select(main_factor, everything()) %>% 
  # arrange questions by descending absolute loading on their main factor
  rowwise() %>% 
  mutate(max_loading = max(abs(c_across(starts_with("fl"))))) %>% 
  group_by(main_factor) %>% 
  arrange(-max_loading, .by_group = TRUE) %>% 
  select(-max_loading) %>% 
  # sort out the presentation
  rename("Main Factor" = main_factor, # the _ throws a latex error
         "Question" = variable) %>%
  # across() supersedes mutate_at(); the bold flag is just the logical
  # condition itself (no need for if_else(..., T, F))
  mutate(across(
    starts_with("fl"),
    ~ cell_spec(round(.x, digits = 3), bold = abs(.x) > 0.4)
  )) %>% 
  kable(booktabs = TRUE, escape = FALSE, longtable = TRUE) %>% 
  kableExtra::collapse_rows(columns = 1, valign = "top") %>%
  kableExtra::kable_styling(latex_options = c("repeat_header"))
Main Factor Question fl1 fl2 description MATH_group
fl1 e14_E14 0.601 0.212 find minimum gradient of cubic B
fl1 e4_E3 0.537 0.215 completing the square A
fl1 e19_E19 0.509 0.233 area between curve and x-axis (in 2 parts) B
fl1 e7_E6 0.505 0.165 graphical vector sum B
fl1 e13_E13 0.498 0.246 equation of tangent A
fl1 e20_E20 0.493 0.183 product rule with given values B
fl1 e3_E2 0.468 0.148 composition of functions B
fl1 e12_E12 0.43 0.286 find point with given slope A
fl1 e10_E9 0.422 0.238 identify graph of rational functions B
fl1 e9_E8 0.405 0.305 simplify logs A
fl1 e5_E4 0.401 0.207 trig double angle formula A
fl1 e6_E5 0.373 0.104 trig wave function A
fl1 e1_E1 0.297 0.146 properties of fractions A
fl2 e17_E17 0.176 0.719 chain rule A
fl2 e16_E16 0.2 0.607 trig chain rule A
fl2 e18_E18 0.309 0.356 definite integral A
fl2 e15_E15 0.261 0.278 find and classify stationary points of cubic A

TODO interpretation

5 Predictive validity

# Course results: one row per student-year, one mark column per course.
course_results <- read_csv("data-uoe/ANON_2013-2017_course-results.csv", col_types = "ccddddddddd")

course_results_long <- course_results %>% 
  pivot_longer(cols = !c(AnonID, year), names_to = "course", values_to = "mark") %>% 
  filter(!is.na(mark)) %>% 
  # column names are like "spec_ILA": split into course type and course code
  separate(course, into = c("course_type", "course"), sep = "_") %>% 
  # normalise the year format to match test_scores (e.g. "2013/4" -> "2013-14";
  # replacing "/" with "-1" inserts the leading "1" of the second year) —
  # NOTE(review): assumes the CSV uses the "YYYY/D" form; confirm against data
  mutate(year = str_replace(year, "/", "-1"))

# Join each course mark with the student's diagnostic test score for the same
# year; rows without a matching test score are dropped.
course_results_vs_diagtest <- course_results_long %>% 
  left_join(test_scores %>% select(AnonID, year, diagtest_score = Total, F1), by = c("AnonID", "year")) %>% 
  filter(!is.na(diagtest_score)) %>% 
  # the revised test was introduced with the 2017-18 cohort
  mutate(test_version = case_when(parse_number(year) >= 2017 ~ "Version 2", TRUE ~ "Version 1"))

We have both course results and diagnostic test scores for the following number of students:

# Count distinct students per cohort and test version, with a totals row.
distinct_students <- course_results_vs_diagtest %>%
  distinct(AnonID, year, test_version)

distinct_students %>%
  janitor::tabyl(year, test_version) %>%
  janitor::adorn_totals() %>%
  gt()
year Version 1 Version 2
2013-14 635 0
2014-15 724 0
2015-16 555 0
2016-17 592 0
2017-18 0 719
Total 2506 719

5.1 Specialist courses

Mathematics students take linear algebra (ILA) in semester 1, then calculus (CAP) and a proofs course (PPS) in semester 2.

# Enrolment counts for each specialist course, by cohort.
course_results_vs_diagtest %>% 
  filter(course_type == "spec") %>% 
  janitor::tabyl(year, course) %>% 
  gt()
year CAP ILA PPS
2013-14 286 302 170
2014-15 313 347 177
2015-16 236 257 168
2016-17 279 299 178
2017-18 322 353 196

This plot shows the results for the two versions of the test:

# Course mark vs diagnostic score, one panel per specialist course and test
# version, with a linear fit and correlation coefficient in each panel.
course_results_vs_diagtest %>% 
  filter(course_type == "spec") %>% 
  # order courses by when students take them (ILA in sem 1; CAP, PPS in sem 2)
  mutate(course = fct_relevel(course, "ILA", "CAP", "PPS")) %>% 
  ggplot(aes(x = diagtest_score, y = mark)) +
  geom_point(size = 0.8, stroke = 0, alpha = 0.5) +
  geom_smooth(method = lm, formula = "y ~ x") +
  ggpubr::stat_cor(label.y = 105, p.accuracy = 0.001) +
  facet_grid(cols = vars(course), rows = vars(test_version)) +
  theme_minimal() +
  theme(strip.text.x = element_text(size = 12),
        strip.text.y = element_text(size = 12, face = "bold", angle = 0)) +
  labs(x = "Edinburgh MDT score", y = "Course result")

# Saves the plot just displayed (ggsave defaults to the last plot shown).
ggsave("output/uoe_pre-vs-post_regression-spec.pdf", units = "cm", width = 16, height = 10)

Another copy just focusing on the new version of the test:

# Same regression view restricted to the new (Version 2) test only.
course_results_vs_diagtest %>% 
  filter(course_type == "spec", test_version == "Version 2") %>% 
  # order courses by when students take them
  mutate(course = fct_relevel(course, "ILA", "CAP", "PPS")) %>% 
  ggplot(aes(x = diagtest_score, y = mark)) +
  geom_point(size = 0.8, stroke = 0, alpha = 0.5) +
  geom_smooth(method = lm, formula = "y ~ x") +
  ggpubr::stat_cor(label.y = 105, p.accuracy = 0.001) +
  facet_grid(cols = vars(course)) +
  theme_minimal() +
  theme(strip.text.x = element_text(size = 12)) +
  labs(x = "Edinburgh MDT score", y = "Course result")

# Saves the plot just displayed.
ggsave("output/uoe_post_regression-spec.pdf", units = "cm", width = 16, height = 7)

This shows that the same general pattern of results is observed with the new version of the test (i.e. becoming less predictive from ILA to CAP to PPS), but with generally higher correlations.

5.2 Non-specialist courses

On the non-specialist side, students take the following courses:

  • MSE - for engineering and chemistry students (discontinued after 2014-15)
  • EM - for engineering students
  • MNS - for chemistry students

The courses come in two parts: 1a in semester 1, and 1b in semester 2.

# Enrolment counts per non-specialist course, by cohort and test version.
course_results_vs_diagtest %>% 
  filter(course_type == "nonspec") %>% 
  mutate(year_and_version = str_glue("{year} ({test_version})")) %>% 
  janitor::tabyl(year_and_version, course) %>% 
  gt()
year_and_version EM1a EM1b MNS1a MNS1b MSE1a MSE1b
2013-14 (Version 1) 0 0 0 0 316 310
2014-15 (Version 1) 0 0 0 0 364 351
2015-16 (Version 1) 231 215 61 57 0 0
2016-17 (Version 1) 221 222 62 58 0 0
2017-18 (Version 2) 274 265 78 75 0 0

For the comparison of results, we drop the first two cohorts since it is only for 2015-16 onward that we have a consistent set of courses.

# Restrict to a consistent set of non-specialist courses (MSE only ran for
# the first two cohorts, so it is dropped).
nonspec_results <- course_results_vs_diagtest %>% 
  filter(course_type == "nonspec") %>% 
  # split e.g. "EM1a" on the digit: course_family = "EM", semester = "a"
  separate(course, into = c("course_family", "semester"), sep = "\\d", remove = FALSE) %>% 
  mutate(semester = ifelse(semester == "a", "Semester 1", "Semester 2")) %>% 
  mutate(course_family = fct_relevel(course_family, "MSE", "EM", "MNS")) %>% 
  filter(course_family != "MSE")
# Mark vs diagnostic score for the remaining non-specialist courses,
# faceted by course and test version, with per-panel fit and correlation.
nonspec_results %>% 
  #filter(semester == "Semester 1") %>% 
  ggplot(aes(x = diagtest_score, y = mark)) +
  geom_point(size = 1, stroke = 0) +
  geom_smooth(method = lm, formula = "y ~ x") +
  ggpubr::stat_cor(label.y = 105, p.accuracy = 0.001) +
  facet_grid(rows = vars(test_version), cols = vars(course)) +
  theme_minimal() +
  theme(strip.text.x = element_text(size = 12),
        strip.text.y = element_text(size = 12, face = "bold", angle = 0)) +
  labs(x = "Edinburgh MDT score", y = "Course result")

# Saves the plot just displayed (ggsave defaults to the last plot shown).
ggsave("output/uoe_pre-vs-post_regression-nonspec.pdf", units = "cm", width = 18, height = 10)

Again, the correlation with course results is higher in all cases for the new version of the test.

About this report

This report supports the analysis in the following paper:

[citation needed]

Packages

In this analysis we used the following packages. You can learn more about each one by clicking on the links below.

  • mirt: For IRT analysis
  • psych: For factor analysis
  • tidyverse: For data wrangling and visualisation
  • reshape: For reshaping nested lists
  • vroom: For reading in many files at once
  • broom: For tidying model output
  • fs: For file system operations
  • gt: For formatting tables
  • knitr: For markdown tables
  • ggrepel: For labelling points without overlap
  • skimr: For data frame level summary
  • ggridges: For ridge plots